# File: preprocess.py
# This file is used to generate the input graph file and the streaming file for the experiments.


import os
import networkx as nx
import sgtl.random


def _looks_like_edge(line):
    '''
    This function tells whether a normalised line starts with two integer node ids.
    '''
    tokens = line.split()
    if len(tokens) < 2:
        return False
    try:
        int(tokens[0])
        int(tokens[1])
    except ValueError:
        return False
    return True


def generate_streaming(input_graph_file, streaming_file):
    '''
    This function generates a streaming file from an input graph file.

    input_graph_file: path of an edge list, one edge per line, with the two
        integer endpoints separated by whitespace or commas.
        - If the first line is a bare integer, the file is handed to networkx
          unchanged.
        - Otherwise commas are replaced by spaces, a non-edge first line
          (e.g. a CSV header) is dropped, and the normalised edge list is
          written to 'edges.txt' in the directory of the input file before
          being loaded.
    streaming_file: path of the file to write. Its first line is the number of
        nodes of the loaded graph; every following line is "u v +" when the
        edge (u, v) exists and "u v -" otherwise, for all pairs u < v.

    The pairs are enumerated over the ids 0 .. n-1 where n is the number of
    nodes of the loaded graph, so the output is only complete when the nodes
    are labelled contiguously starting from 0.
    '''
    # Read everything before writing anything: the normalised copy may target
    # the very same path as the input (input already named 'edges.txt'), and
    # opening that path in "w" mode truncates it.
    with open(input_graph_file, "r") as f:
        raw_lines = f.readlines()

    first_line = raw_lines[0].strip() if raw_lines else ""
    if first_line.isdigit():
        graph = nx.read_edgelist(input_graph_file, nodetype=int, create_using=nx.Graph())
    else:
        normalised = [line.strip().replace(',', ' ') for line in raw_lines]
        # Only discard the first line when it is not an edge (header, blank
        # line, ...); a genuine first edge must stay in the graph.
        if normalised and not _looks_like_edge(normalised[0]):
            normalised = normalised[1:]

        graph_file = os.path.join(os.path.dirname(input_graph_file), 'edges.txt')
        with open(graph_file, "w") as f1:
            f1.writelines(line + '\n' for line in normalised)
        graph = nx.read_edgelist(graph_file, nodetype=int, create_using=nx.Graph())
        print(f"Number of nodes: {graph.number_of_nodes()}")

    num_nodes = graph.number_of_nodes()
    with open(streaming_file, "w") as f:
        # write the number of nodes in the first line
        f.write(f"{num_nodes}\n")
        for u in range(num_nodes):
            for v in range(u + 1, num_nodes):
                sign = '+' if graph.has_edge(u, v) else '-'
                f.write(f"{u} {v} {sign}\n")


def generate_graph_from_SBM(vertices_num, p, edge_file):
    '''
    This function generates a graph from a SBM model and writes it to a file.

    vertices_num: total number of vertices, split into two clusters. When the
        number is odd the second cluster receives the extra vertex.
    p: probability used on the diagonal of the 2x2 probability matrix; the
        off-diagonal entries are 1 - p.
    edge_file: path of the file to write, one "u v" line per edge. Only edges
        are written, so a vertex without any edge does not appear in the file.
    '''
    # Give the remainder to the second cluster so that an odd vertices_num is
    # not silently rounded down to vertices_num - 1.
    first_cluster = vertices_num // 2
    cluster_sizes = [first_cluster, vertices_num - first_cluster]
    q = 1 - p
    prob_mat_q = [[p, q], [q, p]]

    graph = sgtl.random.sbm(cluster_sizes, prob_mat_q).to_networkx()

    with open(edge_file, "w") as f:
        f.writelines(f"{u} {v}\n" for u, v in graph.edges())


def generate_synthetic_datasets(data_dir, num_of_nodes, probability_list):
    '''
    This function builds one synthetic dataset per (node count, probability) pair.

    For every combination a folder 'nodes_<num>/prob_<prob>' is created under
    data_dir (existing folders are reused); an SBM edge list is written to
    'edges.txt' inside it and the matching streaming file to 'streaming.txt'.
    '''
    combinations = ((size, prob) for size in num_of_nodes for prob in probability_list)
    for size, prob in combinations:
        out_dir = os.path.join(data_dir, f'nodes_{size}/prob_{prob}')
        os.makedirs(out_dir, exist_ok=True)

        # the edge list is produced first because the streaming file is derived from it
        edge_path = out_dir + '/edges.txt'
        generate_graph_from_SBM(size, prob, edge_path)
        generate_streaming(edge_path, out_dir + '/streaming.txt')


def relabel_subgraph(raw_edge_file, relabeled_edge_file):
    '''
    This function renumbers the nodes of an edge list to 0, 1, 2, ... and saves the result.

    raw_edge_file: path of the edge list to read (integer node ids).
    relabeled_edge_file: path of the file to write, one "u v" line per edge.

    New ids follow the order in which networkx iterates over the nodes of the
    loaded graph.
    '''
    original = nx.read_edgelist(path=raw_edge_file, nodetype=int, create_using=nx.Graph())

    # old id -> position of the node in the graph's node iteration order
    new_ids = {}
    for position, node in enumerate(original.nodes()):
        new_ids[node] = position
    compact = nx.relabel_nodes(original, new_ids)

    rows = [f"{a} {b}\n" for a, b in compact.edges()]
    with open(relabeled_edge_file, "w") as out:
        out.writelines(rows)


def generate_first_type_datasets(data_dir, datasets):
    '''
    This function generates the first type of datasets (relabeled edge list and streaming file).

    For each dataset name, the raw file '<data_dir>/<dataset>/<suffix>.edges'
    is relabeled into '<data_dir>/<dataset>/edges.txt', where <suffix> is the
    dataset name with its first len('facebook') characters removed. The
    streaming file '<data_dir>/<dataset>/streaming.txt' is then generated from
    the relabeled edge list.
    '''
    prefix_length = len('facebook')
    for dataset in datasets:
        graph_file = os.path.join(data_dir, dataset, "edges.txt")

        # relabel the raw graph so that node ids start from 0
        raw_file = "/".join((data_dir, dataset, dataset[prefix_length:] + ".edges"))
        relabel_subgraph(raw_file, graph_file)

        streaming_path = "/".join((data_dir, dataset, "streaming.txt"))
        generate_streaming(graph_file, streaming_path)


if __name__ == '__main__':
    # Script entry point. Every generation call below is commented out, so
    # running this file as-is does nothing; uncomment the call(s) for the
    # dataset family that should be (re)generated.

    ## synthetic datasets: one SBM graph per (number of nodes, probability) pair
    # generate_synthetic_datasets('./SBM', [100, 500, 1000, 1500, 2000, 2500], [0.95, 0.9, 0.8, 0.7, 0.6, 0.5])

    ## first type datasets: raw '.edges' files are relabeled before the streaming file is built
    # generate_first_type_datasets('./facebook', ['facebook0', 'facebook414', 'facebook3980'])

    ## second type datasets: the streaming file is built directly from the given edge file
    # generate_streaming('./email-Eu-core/email-Eu-core.txt', './email-Eu-core/streaming.txt')
    # generate_streaming('./lastfm_asia/lastfm_asia_edges.csv', './lastfm_asia/streaming.txt')

    pass